Read in data

lifeExpectancy <- read.csv('/Users/laurenflemmer/Desktop/life_expectancy_proj/Life Expectancy Data.csv')

lifeExpectancy <- lifeExpectancy %>% drop_na()

head(lifeExpectancy, 5)
##       Country Year     Status Life.expectancy Adult.Mortality infant.deaths
## 1 Afghanistan 2015 Developing            65.0             263            62
## 2 Afghanistan 2014 Developing            59.9             271            64
## 3 Afghanistan 2013 Developing            59.9             268            66
## 4 Afghanistan 2012 Developing            59.5             272            69
## 5 Afghanistan 2011 Developing            59.2             275            71
##   Alcohol percentage.expenditure Hepatitis.B Measles  BMI under.five.deaths
## 1    0.01              71.279624          65    1154 19.1                83
## 2    0.01              73.523582          62     492 18.6                86
## 3    0.01              73.219243          64     430 18.1                89
## 4    0.01              78.184215          67    2787 17.6                93
## 5    0.01               7.097109          68    3013 17.2                97
##   Polio Total.expenditure Diphtheria HIV.AIDS       GDP Population
## 1     6              8.16         65      0.1 584.25921   33736494
## 2    58              8.18         62      0.1 612.69651     327582
## 3    62              8.13         64      0.1 631.74498   31731688
## 4    67              8.52         67      0.1 669.95900    3696958
## 5    68              7.87         68      0.1  63.53723    2978599
##   thinness..1.19.years thinness.5.9.years Income.composition.of.resources
## 1                 17.2               17.3                           0.479
## 2                 17.5               17.5                           0.476
## 3                 17.7               17.7                           0.470
## 4                 17.9               18.0                           0.463
## 5                 18.2               18.2                           0.454
##   Schooling
## 1      10.1
## 2      10.0
## 3       9.9
## 4       9.8
## 5       9.5
colSums(is.na(lifeExpectancy))
##                         Country                            Year 
##                               0                               0 
##                          Status                 Life.expectancy 
##                               0                               0 
##                 Adult.Mortality                   infant.deaths 
##                               0                               0 
##                         Alcohol          percentage.expenditure 
##                               0                               0 
##                     Hepatitis.B                         Measles 
##                               0                               0 
##                             BMI               under.five.deaths 
##                               0                               0 
##                           Polio               Total.expenditure 
##                               0                               0 
##                      Diphtheria                        HIV.AIDS 
##                               0                               0 
##                             GDP                      Population 
##                               0                               0 
##            thinness..1.19.years              thinness.5.9.years 
##                               0                               0 
## Income.composition.of.resources                       Schooling 
##                               0                               0

To determine which model is appropriate, plot diagnostics

par(mfrow=c(2,2))
# diagnostic plots
basic_model <- lm(Life.expectancy ~ ., data = lifeExpectancy)
plot(basic_model)

# plot response
ggplot(data=lifeExpectancy) +
geom_histogram(mapping=aes(x=Life.expectancy), bins=10) +
ggtitle("Distribution of life expectancy")

ggplot(data=lifeExpectancy) +
geom_boxplot(mapping=aes(x=Country, y=Life.expectancy, color=Country), bins=10) +
ggtitle("Life Expectancy by Country") + 
theme(legend.position = "none") +
coord_flip() + 
theme(axis.text.x = element_text(angle = 80, vjust = 0.5, hjust=1, size=0.01))

# correlation plot
# include only continuous vars
# removed percentage expenditure bc redundant
continuous_vars <- lifeExpectancy %>% select(Year, Life.expectancy, 
                                             Adult.Mortality, infant.deaths, 
                                             Alcohol, Hepatitis.B, Measles, BMI,
                                             under.five.deaths, Polio, Total.expenditure,
                                             Diphtheria, HIV.AIDS, GDP, Population, 
                                             thinness..1.19.years, thinness.5.9.years,
                                             Income.composition.of.resources, Schooling)


corMat <- cor(continuous_vars)
corrplot(corMat, type="lower", method="circle",  tl.cex = 0.75)

corMat
##                                         Year Life.expectancy Adult.Mortality
## Year                             1.000000000      0.05077103    -0.037091782
## Life.expectancy                  0.050771035      1.00000000    -0.702523062
## Adult.Mortality                 -0.037091782     -0.70252306     1.000000000
## infant.deaths                    0.008029128     -0.16907380     0.042450237
## Alcohol                         -0.113364764      0.40271832    -0.175535086
## Hepatitis.B                      0.114897092      0.19993528    -0.105225443
## Measles                         -0.053822046     -0.06888122    -0.003966685
## BMI                              0.005739061      0.54204159    -0.351542478
## under.five.deaths                0.010478594     -0.19226530     0.060365026
## Polio                           -0.016698803      0.32729440    -0.199853000
## Total.expenditure                0.059492777      0.17471764    -0.085226535
## Diphtheria                       0.029640586      0.34133123    -0.191428759
## HIV.AIDS                        -0.123404990     -0.59223629     0.550690745
## GDP                              0.096421485      0.44132181    -0.255034733
## Population                       0.012566893     -0.02230498    -0.015011838
## thinness..1.19.years             0.019756611     -0.45783819     0.272230044
## thinness.5.9.years               0.014122422     -0.45750829     0.286722882
## Income.composition.of.resources  0.122891780      0.72108259    -0.442203288
## Schooling                        0.088731787      0.72763003    -0.421170523
##                                 infant.deaths     Alcohol Hepatitis.B
## Year                              0.008029128 -0.11336476  0.11489709
## Life.expectancy                  -0.169073804  0.40271832  0.19993528
## Adult.Mortality                   0.042450237 -0.17553509 -0.10522544
## infant.deaths                     1.000000000 -0.10621692 -0.23176894
## Alcohol                          -0.106216917  1.00000000  0.10988939
## Hepatitis.B                      -0.231768937  0.10988939  1.00000000
## Measles                           0.532679832 -0.05011023 -0.12479999
## BMI                              -0.234425154  0.35339621  0.14330179
## under.five.deaths                 0.996905622 -0.10108216 -0.24076603
## Polio                            -0.156928805  0.24031453  0.46333080
## Total.expenditure                -0.146951117  0.21488509  0.11332668
## Diphtheria                       -0.161871004  0.24295143  0.58898993
## HIV.AIDS                          0.007711547 -0.02711264 -0.09480197
## GDP                              -0.098092020  0.44343279  0.04184950
## Population                        0.671758310 -0.02888023 -0.12972265
## thinness..1.19.years              0.463415256 -0.40375499 -0.12940595
## thinness.5.9.years                0.461907925 -0.38620819 -0.13325099
## Income.composition.of.resources  -0.134753863  0.56107433  0.18492097
## Schooling                        -0.214371900  0.61697481  0.21518159
##                                      Measles          BMI under.five.deaths
## Year                            -0.053822046  0.005739061        0.01047859
## Life.expectancy                 -0.068881222  0.542041588       -0.19226530
## Adult.Mortality                 -0.003966685 -0.351542478        0.06036503
## infant.deaths                    0.532679832 -0.234425154        0.99690562
## Alcohol                         -0.050110235  0.353396205       -0.10108216
## Hepatitis.B                     -0.124799993  0.143301786       -0.24076603
## Measles                          1.000000000 -0.153245464        0.51750556
## BMI                             -0.153245464  1.000000000       -0.24213740
## under.five.deaths                0.517505563 -0.242137398        1.00000000
## Polio                           -0.057850133  0.186267965       -0.17116419
## Total.expenditure               -0.113582738  0.189468964       -0.14580310
## Diphtheria                      -0.058605907  0.176294503       -0.17844819
## HIV.AIDS                        -0.003521854 -0.210896746        0.01947593
## GDP                             -0.064767590  0.266113973       -0.10033126
## Population                       0.321946377 -0.081415982        0.65867969
## thinness..1.19.years             0.180641506 -0.547017514        0.46478470
## thinness.5.9.years               0.174946217 -0.554093981        0.46228938
## Income.composition.of.resources -0.058277256  0.510504831       -0.14809728
## Schooling                       -0.115660481  0.554843903       -0.22601262
##                                       Polio Total.expenditure  Diphtheria
## Year                            -0.01669880        0.05949278  0.02964059
## Life.expectancy                  0.32729440        0.17471764  0.34133123
## Adult.Mortality                 -0.19985300       -0.08522653 -0.19142876
## infant.deaths                   -0.15692881       -0.14695112 -0.16187100
## Alcohol                          0.24031453        0.21488509  0.24295143
## Hepatitis.B                      0.46333080        0.11332668  0.58898993
## Measles                         -0.05785013       -0.11358274 -0.05860591
## BMI                              0.18626797        0.18946896  0.17629450
## under.five.deaths               -0.17116419       -0.14580310 -0.17844819
## Polio                            1.00000000        0.11976798  0.60924547
## Total.expenditure                0.11976798        1.00000000  0.12991481
## Diphtheria                       0.60924547        0.12991481  1.00000000
## HIV.AIDS                        -0.10788547        0.04310066 -0.11760107
## GDP                              0.15680869        0.18037347  0.15843774
## Population                      -0.04538657       -0.07996224 -0.03989754
## thinness..1.19.years            -0.16406959       -0.20987232 -0.18724165
## thinness.5.9.years              -0.17448925       -0.21786479 -0.18095238
## Income.composition.of.resources  0.31468159        0.18365319  0.34326177
## Schooling                        0.35014660        0.24378345  0.35039793
##                                     HIV.AIDS         GDP   Population
## Year                            -0.123404990  0.09642148  0.012566893
## Life.expectancy                 -0.592236293  0.44132181 -0.022304978
## Adult.Mortality                  0.550690745 -0.25503473 -0.015011838
## infant.deaths                    0.007711547 -0.09809202  0.671758310
## Alcohol                         -0.027112636  0.44343279 -0.028880232
## Hepatitis.B                     -0.094801971  0.04184950 -0.129722655
## Measles                         -0.003521854 -0.06476759  0.321946377
## BMI                             -0.210896746  0.26611397 -0.081415982
## under.five.deaths                0.019475927 -0.10033126  0.658679691
## Polio                           -0.107885468  0.15680869 -0.045386572
## Total.expenditure                0.043100657  0.18037347 -0.079962237
## Diphtheria                      -0.117601074  0.15843774 -0.039897537
## HIV.AIDS                         1.000000000 -0.10808060 -0.027800562
## GDP                             -0.108080600  1.00000000 -0.020368964
## Population                      -0.027800562 -0.02036896  1.000000000
## thinness..1.19.years             0.172591767 -0.27749835  0.282529280
## thinness.5.9.years               0.183146727 -0.27795855  0.277913374
## Income.composition.of.resources -0.248589855  0.44685551 -0.008132466
## Schooling                       -0.211840201  0.46794697 -0.040312419
##                                 thinness..1.19.years thinness.5.9.years
## Year                                      0.01975661         0.01412242
## Life.expectancy                          -0.45783819        -0.45750829
## Adult.Mortality                           0.27223004         0.28672288
## infant.deaths                             0.46341526         0.46190792
## Alcohol                                  -0.40375499        -0.38620819
## Hepatitis.B                              -0.12940595        -0.13325099
## Measles                                   0.18064151         0.17494622
## BMI                                      -0.54701751        -0.55409398
## under.five.deaths                         0.46478470         0.46228938
## Polio                                    -0.16406959        -0.17448925
## Total.expenditure                        -0.20987232        -0.21786479
## Diphtheria                               -0.18724165        -0.18095238
## HIV.AIDS                                  0.17259177         0.18314673
## GDP                                      -0.27749835        -0.27795855
## Population                                0.28252928         0.27791337
## thinness..1.19.years                      1.00000000         0.92791344
## thinness.5.9.years                        0.92791344         1.00000000
## Income.composition.of.resources          -0.45367885        -0.43848372
## Schooling                                -0.49119921        -0.47248203
##                                 Income.composition.of.resources   Schooling
## Year                                                0.122891780  0.08873179
## Life.expectancy                                     0.721082593  0.72763003
## Adult.Mortality                                    -0.442203288 -0.42117052
## infant.deaths                                      -0.134753863 -0.21437190
## Alcohol                                             0.561074332  0.61697481
## Hepatitis.B                                         0.184920970  0.21518159
## Measles                                            -0.058277256 -0.11566048
## BMI                                                 0.510504831  0.55484390
## under.five.deaths                                  -0.148097276 -0.22601262
## Polio                                               0.314681594  0.35014660
## Total.expenditure                                   0.183653190  0.24378345
## Diphtheria                                          0.343261772  0.35039793
## HIV.AIDS                                           -0.248589855 -0.21184020
## GDP                                                 0.446855511  0.46794697
## Population                                         -0.008132466 -0.04031242
## thinness..1.19.years                               -0.453678854 -0.49119921
## thinness.5.9.years                                 -0.438483721 -0.47248203
## Income.composition.of.resources                     1.000000000  0.78474058
## Schooling                                           0.784740581  1.00000000
Variables correlated with life expectancy (|r| > 0.4):
  • Adult mortality rate
  • Alcohol intake
  • BMI
  • HIV/AIDS
  • GDP
  • Prevalence of thinness among ages 10-19
  • Prevalence in thinness among ages 5-9
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling

EDA for variables correlated with life expectancy

# adult mortality vs. life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=Adult.Mortality, y=Life.expectancy, color=Status)) + 
  ggtitle("Adult Mortality Rate vs. Life Expectancy") +
  xlab("Adult Mortality Rate (deaths per 1,000 individuals per year)") + 
  ylab("Life Expectancy")

# alcohol intake vs life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=Alcohol, y=Life.expectancy, color=Status)) + 
  ggtitle("Alcohol Intake vs. Life Expectancy") +
  xlab("Alcohol Intake (average liters per individual)") + 
  ylab("Life Expectancy")

# bmi vs. life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=BMI, y=Life.expectancy, color=Status)) + 
  ggtitle("BMI vs. Life Expectancy") +
  xlab("BMI") + 
  ylab("Life Expectancy")

# hiv/aids vs life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=HIV.AIDS, y=Life.expectancy, color=Status)) + 
  ggtitle("HIV/AIDS deaths vs. Life Expectancy") +
  xlab("HIV/AIDS Deaths (0-4 years)") + 
  ylab("Life Expectancy")

# gdp vs life expectancy
gdp_plot <- ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=GDP, y=Life.expectancy, color=Status)) + 
  ggtitle("GDP vs. Life Expectancy") +
  xlab("GDP per capita (USD)") + 
  ylab("Life Expectancy")

gdp_log_plot <- ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=log(GDP), y=Life.expectancy, color=Status)) + 
  ggtitle("log(GDP) vs. Life Expectancy") +
  xlab("log(GDP) per capita (USD)") + 
  ylab("Life Expectancy")

grid.arrange(gdp_plot, gdp_log_plot)

# prevalence of thinness (ages 10-19)
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=thinness..1.19.years, y=Life.expectancy, color=Status)) + 
  ggtitle("Prevalence of Thinness (ages 10-19) vs. Life Expectancy") +
  xlab("Prevalence of Thinness for ages 10-19 (%)") + 
  ylab("Life Expectancy")

# prevalence of thinness (ages 5-9)
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=thinness.5.9.years, y=Life.expectancy, color=Status)) + 
  ggtitle("Prevalence of Thinness (ages 5-9) vs. Life Expectancy") +
  xlab("Prevalence of Thinness for ages 5-9 (%)") + 
  ylab("Life Expectancy")

# human development index vs. life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=Income.composition.of.resources, y=Life.expectancy, color=Status)) + 
  ggtitle("Human Development Index vs. Life Expectancy") +
  xlab("Human Development Index") + 
  ylab("Life Expectancy")

# years of schooling vs. life expectancy
ggplot(data=lifeExpectancy) +
  geom_point(mapping=aes(x=Schooling, y=Life.expectancy, color=Status)) + 
  ggtitle("Schooling vs. Life Expectancy") +
  xlab("Years of Schooling") + 
  ylab("Life Expectancy")

EDA for non-continuous variables

# status vs. life expectancy
developing_expectancy <- lifeExpectancy %>% filter(Status == "Developing") %>% select(Life.expectancy)
developed_expectancy <- lifeExpectancy %>% filter(Status == "Developed") %>% select(Life.expectancy)

boxplot_developing <- ggplot(data=developing_expectancy) +
  geom_boxplot(mapping=aes(x=Life.expectancy)) + 
  ggtitle("Life Expectancy for Developing Countries") +
  xlab("Life Expectancy")

boxplot_developed <- ggplot(data=developed_expectancy) +
  geom_boxplot(mapping=aes(x=Life.expectancy)) + 
  ggtitle("Life Expectancy for Developed Countries") +
  xlab("Life Expectancy")

grid.arrange(boxplot_developing, boxplot_developed)

Initial modeling using ‘geeglm’ package

Model 1 (Independent correlation structure, gaussian) Predictors:

  • Status (Developed/Developing)
  • Adult mortality rate
  • Alcohol intake
  • BMI
  • HIV/AIDS
  • log(GDP)
  • Prevalence of thinness among ages 10-19
  • Prevalence in thinness among ages 5-9
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling
# convert country to factor
lifeExpectancy$Country <- as.factor(lifeExpectancy$Country)

# convert HDI from 0-1 to 0-100 for better interpretability of model estimates
lifeExpectancy$Income.composition.of.resources <- lifeExpectancy$Income.composition.of.resources * 100
# fit initial model
initialModel <- geeglm(formula = Life.expectancy ~ Status + Year + Adult.Mortality + Alcohol + BMI 
                              + HIV.AIDS + log(GDP) + thinness..1.19.years + thinness.5.9.years
                              + Income.composition.of.resources + Schooling, id = Country, 
                              data = lifeExpectancy, family = "gaussian")
summary(initialModel)$coefficients
##                                     Estimate      Std.err       Wald
## (Intercept)                     358.66456721 75.453919934 22.5950515
## StatusDeveloping                 -1.36926240  0.819073000  2.7946546
## Year                             -0.15247061  0.038030626 16.0733104
## Adult.Mortality                  -0.01748929  0.002305808 57.5305255
## Alcohol                          -0.18849973  0.085152151  4.9003850
## BMI                               0.02743665  0.010028907  7.4843649
## HIV.AIDS                         -0.44441915  0.049598995 80.2859892
## log(GDP)                          0.49802463  0.095407183 27.2483054
## thinness..1.19.years             -0.03846752  0.068831247  0.3123324
## thinness.5.9.years               -0.03477710  0.057004658  0.3721912
## Income.composition.of.resources   0.11102774  0.027585366 16.1996464
## Schooling                         0.93039398  0.164142456 32.1286090
##                                     Pr(>|W|)
## (Intercept)                     1.999991e-06
## StatusDeveloping                9.457914e-02
## Year                            6.093684e-05
## Adult.Mortality                 3.330669e-14
## Alcohol                         2.685071e-02
## BMI                             6.223702e-03
## HIV.AIDS                        0.000000e+00
## log(GDP)                        1.789319e-07
## thinness..1.19.years            5.762524e-01
## thinness.5.9.years              5.418123e-01
## Income.composition.of.resources 5.700475e-05
## Schooling                       1.442967e-08
# plot residuals
initialResid <- as.vector(initialModel$residuals)
mean_abs_resid_initial <- mean(abs(initialResid))

ggplot() +
  geom_histogram(aes(initialResid)) +
  ggtitle(label = "Residuals of initial model \n(independent correlation structure, gaussian)",
      subtitle = paste("Mean absolute error: ", mean_abs_resid_initial))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Model 2 (Independent correlation structure) Predictors:

(Backward selection done using Robust Z)
  • Status (Developed/Developing)
  • Adult mortality rate
  • Alcohol intake
  • BMI
  • HIV/AIDS
  • log(GDP)
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling
# fit second model
secondModel <- geeglm(formula = Life.expectancy ~ Status + Year + Adult.Mortality + Alcohol + BMI + 
                                HIV.AIDS + log(GDP) + Income.composition.of.resources + Schooling, 
                                id = Country, data = lifeExpectancy, family = "gaussian")
summary(secondModel)$coefficients
##                                     Estimate     Std.err      Wald     Pr(>|W|)
## (Intercept)                     362.17234816 74.85685334 23.408169 1.310210e-06
## StatusDeveloping                 -1.37665316  0.82975532  2.752637 9.709416e-02
## Year                             -0.15465680  0.03767976 16.846968 4.051801e-05
## Adult.Mortality                  -0.01753501  0.00229392 58.432646 2.098322e-14
## Alcohol                          -0.17648021  0.08527053  4.283454 3.848506e-02
## BMI                               0.03410873  0.01044020 10.673663 1.086717e-03
## HIV.AIDS                         -0.44699537  0.04902540 83.131154 0.000000e+00
## log(GDP)                          0.49770594  0.09523882 27.309744 1.733357e-07
## Income.composition.of.resources   0.11214319  0.02707574 17.154775 3.445422e-05
## Schooling                         0.94413800  0.15991610 34.856724 3.548818e-09
# plot residuals
secondResid <- as.vector(secondModel$residuals)
mean_abs_resid_second <- mean(abs(secondResid))

ggplot() +
  geom_histogram(aes(secondResid)) +
  ggtitle(label = "Residuals of second model \n(independent correlation structure)",
      subtitle = paste("Mean absolute error: ", mean_abs_resid_second))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

“AR1” correlation

Model 1 (AR1 correlation structure)Predictors:

  • Status (Developed/Developing)
  • Adult mortality rate
  • Alcohol intake
  • BMI
  • HIV/AIDS
  • log(GDP)
  • Prevalence of thinness among ages 10-19
  • Prevalence in thinness among ages 5-9
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling
# fit initial model
initialModel_ar1 <- geeglm(formula = Life.expectancy ~ Status + Year + Adult.Mortality + Alcohol + BMI 
                              + HIV.AIDS + log(GDP) + thinness..1.19.years + thinness.5.9.years
                              + Income.composition.of.resources + Schooling, id = Country, 
                              data = lifeExpectancy, corstr = "ar1", family = "gaussian")
summary(initialModel_ar1)$coefficients
##                                      Estimate      Std.err         Wald
## (Intercept)                     -8.615082e+01 5.706060e+01  2.279534421
## StatusDeveloping                -5.824818e+00 1.040977e+00 31.309973394
## Year                             7.304077e-02 2.906636e-02  6.314649336
## Adult.Mortality                 -7.352253e-04 5.368132e-04  1.875834428
## Alcohol                         -5.622125e-03 2.282424e-02  0.060674809
## BMI                              1.463369e-03 2.999144e-03  0.238074626
## HIV.AIDS                        -4.078279e-01 5.870129e-02 48.267915457
## log(GDP)                         1.952508e-03 3.141030e-02  0.003864038
## thinness..1.19.years            -4.008282e-02 3.277668e-02  1.495500857
## thinness.5.9.years               1.521806e-02 2.768839e-02  0.302080819
## Income.composition.of.resources  1.683596e-02 4.707713e-03 12.789572690
## Schooling                        1.103759e+00 1.234923e-01 79.885624828
##                                     Pr(>|W|)
## (Intercept)                     1.310912e-01
## StatusDeveloping                2.199468e-08
## Year                            1.197444e-02
## Adult.Mortality                 1.708083e-01
## Alcohol                         8.054325e-01
## BMI                             6.256002e-01
## HIV.AIDS                        3.717804e-12
## log(GDP)                        9.504343e-01
## thinness..1.19.years            2.213649e-01
## thinness.5.9.years              5.825809e-01
## Income.composition.of.resources 3.485567e-04
## Schooling                       0.000000e+00
# plot residuals
initialResid_ar1 <- as.vector(initialModel_ar1$residuals)
mean_abs_resid_initial_ar1 <- mean(abs(initialResid_ar1))

ggplot() +
  geom_histogram(aes(initialResid_ar1)) +
  ggtitle(label = "Residuals of initial model \n(AR1 correlation structure)",
      subtitle = paste("Mean absolute error: ", mean_abs_resid_initial_ar1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Using AR1 Correlation structure, perform backwards selection of predictors

Model 2 (AR1 Correlation Structure) Predictors:

  • Status (Developed/Developing)
  • Year
  • HIV/AIDS
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling
# fit initial model
model2_ar1 <- geeglm(formula = Life.expectancy ~ Status + Year + HIV.AIDS
                              + Income.composition.of.resources + Schooling, id = Country, 
                              data = lifeExpectancy, corstr = "ar1", family = "gaussian")
summary(model2_ar1)$coefficients
##                                     Estimate      Std.err      Wald
## (Intercept)                     -92.69412291 56.417504072  2.699459
## StatusDeveloping                 -5.95221369  1.031812649 33.277856
## Year                              0.07619169  0.028783899  7.006739
## HIV.AIDS                         -0.40892008  0.060509651 45.669640
## Income.composition.of.resources   0.01685699  0.004582926 13.529264
## Schooling                         1.11410328  0.122475875 82.746532
##                                     Pr(>|W|)
## (Intercept)                     1.003823e-01
## StatusDeveloping                7.988718e-09
## Year                            8.120345e-03
## HIV.AIDS                        1.399758e-11
## Income.composition.of.resources 2.348722e-04
## Schooling                       0.000000e+00
# plot residuals
model2Resid_ar1 <- as.vector(model2_ar1$residuals)
mean_abs_resid_model2_ar1 <- mean(abs(model2Resid_ar1))

ggplot() +
  geom_histogram(aes(model2Resid_ar1)) +
  ggtitle(label = "Residuals of Model 2 \n(AR1 correlation structure)",
      subtitle = paste("Mean absolute error: ", mean_abs_resid_model2_ar1))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare Independent vs. AR-1 Standard Error:

(Same predictors in both models)
  • Status (Developed/Developing)
  • Adult mortality rate
  • HIV/AIDS
  • Human Development Index (in terms of income composition of resources)
  • Number of years of schooling
# fit independent model w/ same predictors as final AR-1 model to compare standard errors
testIndepModel <- geeglm(formula = Life.expectancy ~ Status + Year + HIV.AIDS
                                + Income.composition.of.resources + Schooling, 
                                id = Country, data = lifeExpectancy, family = "gaussian")

summary(testIndepModel)$coefficients
##                                    Estimate     Std.err      Wald     Pr(>|W|)
## (Intercept)                     360.5058009 84.26742318 18.302275 1.884819e-05
## StatusDeveloping                 -1.2318027  0.97380582  1.600065 2.058940e-01
## Year                             -0.1556290  0.04253417 13.387679 2.532826e-04
## HIV.AIDS                         -0.6413125  0.08679507 54.594540 1.481038e-13
## Income.composition.of.resources   0.1434149  0.03599286 15.876533 6.761189e-05
## Schooling                         1.1997387  0.20442289 34.444057 4.386801e-09

Using AR-1 Model 2, Examine Predicted vs. Actual Values

fitted_vals <- model2_ar1$fitted.values
actual_vals <- lifeExpectancy$Life.expectancy
actual_fitted_df <- as.data.frame(cbind(fitted_vals, actual_vals, model2Resid_ar1))
names(actual_fitted_df)[1] <- "fitted_vals"
names(actual_fitted_df)[3] <- "Residual"


ggplot(actual_fitted_df) +
  geom_point(aes(x=actual_vals, fitted_vals, color=Residual)) +
  ggtitle("Actual vs. Fitted Values for AR-1 Model 2") +
  xlab("Actual Life Expectancy") +
  ylab("Predicted Life Expectancy")

Feature Importance in AR-1 Model 2

feature_names <- c("Development \n Status", "Year", "HIV/AIDS Deaths", "Human Development  \n Index", "Years of Schooling")
abs_coeff <- abs(model2_ar1$coefficients[-1])

ggplot() +
  geom_col(aes(x=feature_names, y=abs_coeff), fill = "dark blue") +
  ggtitle("Feature Importance of AR-1 Model 2") +
  xlab("Feature") +
  ylab("Absolute Value of Coefficient")


Exploring new clustering based on similarity of countries

K-Means to create new country clusters

kmeans_vars <- continuous_vars[, c(-1, -2, -18)]

k_vec <- 3:30
tot_within_SS <- c()

# Cross-Validate to find the best K, which minimizes the total within cluster sum of squares
for (k in k_vec) {
  
  clustering_k <- kmeans(kmeans_vars, centers=k, nstart=10, iter.max=100)
  tot_within_SS <- c(tot_within_SS, clustering_k$tot.withinss)

}

# plot K vs. total within-cluster SS
ggplot() +
  geom_line(aes(x=k_vec, y=tot_within_SS), color="blue", lwd = 1) +
  ggtitle("Total Within-Cluster Sum of Squares for K = (3, 4, ..., 30)") + 
  xlab("K") +
  ylab("Total Within-Cluster SS")

Optimal number of clusters: K = 10

# perform final clustering
kmeans10 <- kmeans(kmeans_vars, centers=10, nstart=10)
new_clusters <- factor(kmeans10$cluster)

Compare human development index values for cluster members

HDI <- lifeExpectancy$Income.composition.of.resources

# get countries belonging to each cluster
lifeExpectancy$cluster <- new_clusters
countries_cluster <- lifeExpectancy %>% group_by(cluster) %>% summarise(countries = str_c(unique(Country), collapse = ", "))

ggplot() +
  geom_boxplot(aes(x=new_clusters, y=HDI, fill=new_clusters)) +
  ggtitle("Human Development Index of New Cluster Members") +
  xlab("Cluster")

Countries in Cluster 1: Algeria, Argentina, Colombia, Italy, Kenya, Myanmar, Poland, South Africa, Spain, Uganda, Ukraine
Countries in Cluster 2: Afghanistan, Albania, Algeria, Angola, Argentina, Armenia, Australia, Belgium, Bosnia and Herzegovina, Botswana, Brazil, Cameroon, Canada, Central African Republic, Colombia, Costa Rica, Croatia, Eritrea, Ghana, Greece, Indonesia, Iraq, Ireland, Jamaica, Jordan, Kenya, Latvia, Lebanon, Lesotho, Liberia, Lithuania, Madagascar, Malaysia, Mauritania, Mongolia, Morocco, Mozambique, Myanmar, Namibia, Nepal, Nicaragua, Panama, Papua New Guinea, Peru, Poland, Romania, Senegal, South Africa, Spain, Syrian Arab Republic, Tunisia, Turkmenistan, Uganda, Ukraine, Uruguay, Uzbekistan
Countries in Cluster 3: Ethiopia, France, Germany, Philippines, Thailand, Turkey
Countries in Cluster 4: Afghanistan, Algeria, Angola, Australia, Brazil, Canada, Ghana, Indonesia, Iraq, Madagascar, Malaysia, Morocco, Mozambique, Nepal, Peru, Romania, Uganda
Countries in Cluster 5: Brazil, Indonesia, Pakistan
Countries in Cluster 6: Austria, Azerbaijan, Belarus, Belgium, Benin, Bulgaria, Burundi, Chad, Dominican Republic, El Salvador, France, Germany, Greece, Guinea, Honduras, Italy, Jordan, Mexico, Myanmar, Nicaragua, Papua New Guinea, Paraguay, Philippines, Rwanda, Senegal, Serbia, Sierra Leone, South Africa, Sweden, Tajikistan, Thailand, Togo, Tunisia, Turkey
Countries in Cluster 7: Bangladesh, India, Mexico, Nigeria, Russian Federation
Countries in Cluster 8: Afghanistan, Bangladesh, Brazil, Burkina Faso, Cambodia, Cameroon, Chad, Chile, Ecuador, Ghana, Guatemala, India, Indonesia, Kazakhstan, Madagascar, Malawi, Mali, Mexico, Mozambique, Netherlands, Niger, Nigeria, Pakistan, Romania, Russian Federation, Senegal, Syrian Arab Republic, Zambia, Zimbabwe
Countries in Cluster 9: Afghanistan, Albania, Algeria, Angola, Argentina, Armenia, Australia, Austria, Azerbaijan, Bangladesh, Belarus, Belgium, Belize, Benin, Bhutan, Bosnia and Herzegovina, Botswana, Brazil, Bulgaria, Burkina Faso, Burundi, Cabo Verde, Cambodia, Cameroon, Canada, Central African Republic, Chad, Chile, China, Colombia, Comoros, Costa Rica, Croatia, Cyprus, Djibouti, Dominican Republic, Ecuador, El Salvador, Equatorial Guinea, Eritrea, Estonia, Ethiopia, Fiji, France, Gabon, Georgia, Germany, Ghana, Greece, Guatemala, Guinea, Guinea-Bissau, Guyana, Haiti, Honduras, Iraq, Ireland, Israel, Italy, Jamaica, Jordan, Kazakhstan, Kenya, Kiribati, Latvia, Lebanon, Lesotho, Liberia, Lithuania, Luxembourg, Madagascar, Malawi, Malaysia, Maldives, Mali, Malta, Mauritania, Mauritius, Mexico, Mongolia, Montenegro, Morocco, Mozambique, Myanmar, Namibia, Netherlands, Nicaragua, Niger, Nigeria, Pakistan, Panama, Papua New Guinea, Paraguay, Peru, Philippines, Poland, Portugal, Romania, Russian Federation, Rwanda, Samoa, Sao Tome and Principe, Senegal, Serbia, Seychelles, Sierra Leone, Solomon Islands, South Africa, Spain, Sri Lanka, Suriname, Swaziland, Sweden, Syrian Arab Republic, Tajikistan, Thailand, Timor-Leste, Togo, Tonga, Trinidad and Tobago, Tunisia, Turkey, Turkmenistan, Uganda, Ukraine, Uruguay, Uzbekistan, Vanuatu, Zambia, Zimbabwe
Countries in Cluster 10: India

Fit Model 1 using new clusters

Model 1 (Unstructured correlation structure, clustering by K-Means clusters):

# fit initial model
initialModel_clusters <- geeglm(formula = Life.expectancy ~ Status + Year + Adult.Mortality + Alcohol + BMI 
                              + HIV.AIDS + log(GDP) + thinness..1.19.years + thinness.5.9.years
                              + Income.composition.of.resources + Schooling, id = cluster, 
                              data = lifeExpectancy, corstr = "unstructured", family = "gaussian")
summary(initialModel_clusters)$coefficients
##                                      Estimate      Std.err       Wald
## (Intercept)                     412.836364926 3.869385e+02  1.1383400
## StatusDeveloping                 -1.473929273 7.888183e-01  3.4913980
## Year                             -0.181148442 1.923979e-01  0.8864789
## Adult.Mortality                  -0.008920334 2.079018e-03 18.4096550
## Alcohol                          -0.236824951 7.681688e-02  9.5047690
## BMI                               0.029508709 2.773239e-02  1.1322070
## HIV.AIDS                         -0.577117408 6.453359e-02 79.9754470
## log(GDP)                          0.316827281 3.834731e-01  0.6826143
## thinness..1.19.years             -0.048453554 6.154738e-02  0.6197723
## thinness.5.9.years               -0.052582088 6.310708e-02  0.6942558
## Income.composition.of.resources   0.123730217 6.308320e-02  3.8470175
## Schooling                         1.169007353 3.651510e-01 10.2491891
##                                     Pr(>|W|)
## (Intercept)                     0.2860033650
## StatusDeveloping                0.0616884688
## Year                            0.3464332873
## Adult.Mortality                 0.0000178153
## Alcohol                         0.0020493855
## BMI                             0.2873050751
## HIV.AIDS                        0.0000000000
## log(GDP)                        0.4086879505
## thinness..1.19.years            0.4311318977
## thinness.5.9.years              0.4047205777
## Income.composition.of.resources 0.0498345339
## Schooling                       0.0013674470
# plot residuals
initialResid_clusters <- as.vector(initialModel_clusters$residuals)
mean_abs_resid_initial_clusters <- mean(abs(initialResid_clusters))

ggplot() +
  geom_histogram(aes(initialResid_clusters)) +
  ggtitle(label = "Residuals of initial model \n(unstructured correlation structure, clusters from K-Means)",
      subtitle = paste("Mean absolute error: ", mean_abs_resid_initial_clusters))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Comparing standard errors from clustering by Country vs. K-Means

summary(initialModel_ar1)$coefficients
##                                      Estimate      Std.err         Wald
## (Intercept)                     -8.615082e+01 5.706060e+01  2.279534421
## StatusDeveloping                -5.824818e+00 1.040977e+00 31.309973394
## Year                             7.304077e-02 2.906636e-02  6.314649336
## Adult.Mortality                 -7.352253e-04 5.368132e-04  1.875834428
## Alcohol                         -5.622125e-03 2.282424e-02  0.060674809
## BMI                              1.463369e-03 2.999144e-03  0.238074626
## HIV.AIDS                        -4.078279e-01 5.870129e-02 48.267915457
## log(GDP)                         1.952508e-03 3.141030e-02  0.003864038
## thinness..1.19.years            -4.008282e-02 3.277668e-02  1.495500857
## thinness.5.9.years               1.521806e-02 2.768839e-02  0.302080819
## Income.composition.of.resources  1.683596e-02 4.707713e-03 12.789572690
## Schooling                        1.103759e+00 1.234923e-01 79.885624828
##                                     Pr(>|W|)
## (Intercept)                     1.310912e-01
## StatusDeveloping                2.199468e-08
## Year                            1.197444e-02
## Adult.Mortality                 1.708083e-01
## Alcohol                         8.054325e-01
## BMI                             6.256002e-01
## HIV.AIDS                        3.717804e-12
## log(GDP)                        9.504343e-01
## thinness..1.19.years            2.213649e-01
## thinness.5.9.years              5.825809e-01
## Income.composition.of.resources 3.485567e-04
## Schooling                       0.000000e+00
summary(initialModel_clusters)$coefficients
##                                      Estimate      Std.err       Wald
## (Intercept)                     412.836364926 3.869385e+02  1.1383400
## StatusDeveloping                 -1.473929273 7.888183e-01  3.4913980
## Year                             -0.181148442 1.923979e-01  0.8864789
## Adult.Mortality                  -0.008920334 2.079018e-03 18.4096550
## Alcohol                          -0.236824951 7.681688e-02  9.5047690
## BMI                               0.029508709 2.773239e-02  1.1322070
## HIV.AIDS                         -0.577117408 6.453359e-02 79.9754470
## log(GDP)                          0.316827281 3.834731e-01  0.6826143
## thinness..1.19.years             -0.048453554 6.154738e-02  0.6197723
## thinness.5.9.years               -0.052582088 6.310708e-02  0.6942558
## Income.composition.of.resources   0.123730217 6.308320e-02  3.8470175
## Schooling                         1.169007353 3.651510e-01 10.2491891
##                                     Pr(>|W|)
## (Intercept)                     0.2860033650
## StatusDeveloping                0.0616884688
## Year                            0.3464332873
## Adult.Mortality                 0.0000178153
## Alcohol                         0.0020493855
## BMI                             0.2873050751
## HIV.AIDS                        0.0000000000
## log(GDP)                        0.4086879505
## thinness..1.19.years            0.4311318977
## thinness.5.9.years              0.4047205777
## Income.composition.of.resources 0.0498345339
## Schooling                       0.0013674470